Import Libraries



In [1]:

    
# Libraries for the analysis
from gensim import corpora, models, similarities
import numpy
import re
from nltk.stem import WordNetLemmatizer as WNL
import matplotlib.pyplot as plt
import matplotlib.patches as patches
%matplotlib inline

# Libraries for the presentation
from IPython.display import display, HTML
from ipywidgets import widgets
from IPython.display import clear_output

# Download Necessary Stemmer
import nltk
nltk.download('wordnet')

Define Functions / Global Variables



In [18]:

    
### Functions ###
# For analysis
def stem_of_(word):
    """Return the base form of ``word`` (e.g., "sales" -> "sale").

    Delegates to the module-level WordNet lemmatizer ``wnl``.
    """
    lemma = wnl.lemmatize(word)
    return lemma

def clean_words_in_(document):
    """Lower-case ``document`` and split it into words.

    Runs of characters other than ``a-z``/``0-9`` act as separators, except
    that an apostrophe directly in front of a letter or digit is kept
    (so "company's" stays one word).

    Splitting produces empty strings when the text starts or ends with a
    separator (or is empty); those are dropped so they never reach the
    dictionary or corpus as a bogus ``''`` token.

    Returns:
        list of str: the non-empty words, in their original order.
    """
    pieces = re.split(r'(?:[^a-z0-9]*(?!\')[^a-z0-9]+)', document.lower())
    return [piece for piece in pieces if piece]
        
def color_words(model, doc):
    """Map the words of a tokenised document to a topic id.

    Args:
        model: topic model exposing ``id2word`` and ``get_document_topics``.
        doc: list of (already cleaned) tokens.

    Returns:
        dict: word -> first topic id listed for that word by the model.
        Words for which the model lists no topic are left out.
        NOTE(review): gensim documents the per-word topic list as sorted by
        relevance, so the first entry should be the most likely topic -- confirm.
    """
    # tokens -> bag of words -> TF-IDF weights (uses the module-level ``tfidf``)
    weighted_bow = tfidf[model.id2word.doc2bow(doc)]
    # Only the per-word topic assignments are used; the other two results are ignored.
    _, per_word_topics, _ = model.get_document_topics(weighted_bow, per_word_topics=True)
    return {
        model.id2word[word_id]: topic_ids[0]
        for word_id, topic_ids in per_word_topics
        if topic_ids != []
    }

# For presentation
def start_over(btn):
    """Click handler for the start button: show page 1 with the navigation buttons.

    ``btn`` is the clicked widget; it is not used.
    """
    global curPage
    clear_output()
    curPage = 1
    # Show both navigation buttons, then retire the start button itself.
    for nav_button in (btn_previous_page, btn_next_page):
        display(nav_button)
    btn_start_over.close()
    page_content()

def previous_page(btn):
    """Click handler: step back one page (never below page 1) and redraw.

    ``btn`` is the clicked widget; it is not used.
    """
    global curPage
    target = curPage - 1
    curPage = target if target > 1 else 1
    clear_output()
    page_content()

def next_page(btn):
    """Click handler: step forward one page (never past ``totalPage``) and redraw.

    ``btn`` is the clicked widget; it is not used.
    """
    global curPage
    target = curPage + 1
    curPage = target if target < totalPage else totalPage
    clear_output()
    page_content()
    
def page_content():
    """Render the page selected by the module-level ``curPage``.

    Pages 1-7 map to their own renderer; any other value falls back to page 8.
    """
    renderers = {
        1: page_1_display,
        2: page_2_display,
        3: page_3_display,
        4: page_4_display,
        5: page_5_display,
        6: page_6_display,
        7: page_7_display,
    }
    render = renderers.get(curPage, page_8_display)
    render()

def bold(s):
    """Wrap the string ``s`` in an HTML ``<b>`` element."""
    return "".join(["<b>", s, "</b>"])

def strikethrough(s):
    """Wrap the string ``s`` in a ``<span>`` styled with a line-through decoration."""
    opening = '<span style="text-decoration: line-through;">'
    return "".join([opening, s, "</span>"])

def color(s, color):
    """Wrap the string ``s`` in a ``<span>`` whose CSS text colour is ``color``.

    ``color`` is inserted verbatim as the CSS value (e.g. "rgb(0,64,128)").
    """
    opening = "".join(['<span style="color:', color, ';">'])
    return "".join([opening, s, "</span>"])

def get_color(cols, percentages):
    """Blend RGB colours into one CSS ``rgb(r,g,b)`` string.

    Args:
        cols: sequence of ``[r, g, b]`` triples.
        percentages: weight for each colour; ``percentages[i]`` scales ``cols[i]``.

    Returns:
        str: the weighted sum of each channel, truncated to ``int``.
    """
    totals = [0, 0, 0]
    for idx, rgb in enumerate(cols):
        for channel in range(3):
            totals[channel] += rgb[channel] * percentages[idx]
    return "rgb(" + ",".join(str(int(value)) for value in totals) + ")"

def fake_probability(score_list):
    """Turn scores into pseudo-probabilities that sum to 1.

    Each score ``s`` is first mapped to ``(s + 1) / 2``; the mapped values are
    then divided by their total.
    """
    shifted = [(score + 1) / 2 for score in score_list]
    norm = sum(shifted)
    return [value / norm for value in shifted]

# Actual content of each page
def page_1_display():
    """Render page 1: an introduction followed by every raw document.

    Reads the module-level ``documents`` and ``curPage``.
    """
    intro = "<b style=\"color:rgb(64,128,128);\">Here we present some sample documents, each of which is revised from a \
    news title on the internet.</br>The subject \"apple\" is in all these 6 titles, while in the first three lines it means \
    the fruit, and in the next three lines it means the tech company.</b><br />"
    # One paragraph per document, in input order.
    paragraphs = ["<p>" + str(doc) + "</p>" for doc in documents]
    # Right-aligned page number.
    footer = "<br /><p style=\"text-align:right\">" + str(curPage) + "</p>"
    display(HTML(intro + "".join(paragraphs) + footer))

def page_2_display():
    """Render page 2: each raw document next to its cleaned form.

    Raw words that have no entry in ``dict_raw_to_clean`` are shown struck
    through in red; the cleaned tokens follow after " -> ".

    Reads the module-level ``documents``, ``dict_raw_to_clean``, ``stoplist``
    and ``curPage``.
    """
    content = ""
    content += "<b style=\"color:rgb(64,128,128);\">Clean each document into individual words and remove words that \
    contributes little to the comprehension of the documents.<br />Now let's take a look at these processed documents:\
    </b><br />"
    for doc in documents:
        # Left-hand side: the raw words, with the dropped ones highlighted.
        raw_words = []
        for raw_word in doc.split(" "):
            if raw_word not in dict_raw_to_clean:
                raw_word = strikethrough(color(raw_word, "rgb(199,33,93)"))
            raw_words.append(raw_word)
        # Right-hand side: the tokens that survive cleaning.
        filtered_doc = [stem_of_(word)
                        for word in clean_words_in_(doc)
                        if word not in stoplist]
        # Joining avoids the stray trailing space the old code tried to remove
        # with a bare ``content.strip()`` whose result was thrown away.
        content += "<p>" + " ".join(raw_words) + " -> " + " ".join(filtered_doc) + "</p>"
    content += "<br /><b style=\"color:rgb(64,128,128);\">Looking good! Our documents are ready for further analysis \
    in the next step.</b>"
    content += "<br /><p style=\"text-align:right\">" + str(curPage) + "</p>"
    display(HTML(content))
    
def page_3_display():
    """Render page 3: every filtered document with each token's dictionary id.

    Reads the module-level ``filtered_documents``, ``dictionary`` and ``curPage``.
    """
    content = ""
    content += "<b style=\"color:rgb(64,128,128);\">In order to build the semantics model, a \"dictionary\" containing all \
    words and phrases appeared in the documents is required.<br />This dictionary maps each word or phrase (\"token\") \
    to its unique id.</b><br />"
    for doc in filtered_documents:
        # Token followed by its id as a superscript, e.g. word<sup>(3)</sup>.
        annotated = [w + "<sup>(" + str(dictionary.token2id[w]) + ")</sup>" for w in doc]
        # Joining avoids the trailing space that the old, discarded
        # ``content.strip()`` call never actually removed.
        content += "<p>" + " ".join(annotated) + "</p>"
    content += "<br /><p style=\"text-align:right\">" + str(curPage) + "</p>"
    display(HTML(content))
    
def page_4_display():
    """Render page 4: the bag-of-words corpus, one paragraph per document.

    Each entry is shown as ``token: (token_id, frequency);``.

    Reads the module-level ``corpus``, ``dictionary`` and ``curPage``.
    """
    content = ""
    content += "<b style=\"color:rgb(64,128,128);\">With dictionary in hand, the collection of documents can then be \
    translated into a vector form. This is called a corpus.<br />The form of representation in the corpus is \
    called \"bag-of-words\". Each pair (x, y) means that in this document, the word with id \"x\" has a frequency \
    of \"y\".</b><br />"
    for doc in corpus:
        # Look tokens up through ``dictionary[id]`` rather than
        # ``dictionary.id2token[id]``: gensim documents ``id2token`` as lazily
        # initialised, so indexing it directly can raise KeyError if nothing
        # has triggered its construction yet.
        entries = [str(dictionary[t[0]]) + ": " + str(t) + ";" for t in doc]
        # Joining avoids the trailing space that the old, discarded
        # ``content.strip()`` call never actually removed.
        content += "<p>" + " ".join(entries) + "</p>"
    content += "<br /><p style=\"text-align:right\">" + str(curPage) + "</p>"
    display(HTML(content))

def page_5_display():
    """Render page 5: the top 5 words (with weights) of every topic.

    Each topic is printed in its own colour from ``cols``.

    Reads the module-level ``model``, ``n_topics``, ``cols`` and ``curPage``.
    """
    content = ""
    content += "<b style=\"color:rgb(64,128,128);\">After all the preparation work, we can now build the model to \
    recognize topics. In this demo, Latent Dirichlet Allocation (LDA) is used to transform each document from \
    \"vector of words\" to \"vector of topics\". We already know that there are 2 topics: fruit and technology, so we \
    set this into the input of our model.<br />Here are the lists of (top 5) words that best describe the topics:</b><br />"
    # Iterate over topic ids directly. The previous ``enumerate(model.show_topics())``
    # only used the counter, and ``show_topics()`` returns at most 10 topics by default.
    for idx in range(n_topics):
        # One-hot weights select this topic's own colour.
        p = [0]*n_topics
        p[idx] = 1
        col = get_color(cols, p)
        terms = [w + " (" + "{0:.2f}".format(weight) + ");"
                 for w, weight in model.show_topic(idx, topn=5)]
        # Joining avoids the trailing space that the old, discarded
        # ``content.strip()`` call never actually removed.
        content += "<p style=\"color:"+col+";\">Topic"+str(idx)+": " + " ".join(terms) + "</p>"
    content += "<br /><p style=\"text-align:right\">" + str(curPage) + "</p>"
    display(HTML(content))

def page_6_display():
    """Render page 6: the training documents with every word coloured by topic.

    Words that ``color_words`` assigns no topic get all-zero weights and are
    therefore rendered as ``rgb(0,0,0)``.

    Reads the module-level ``filtered_documents``, ``model``, ``n_topics``,
    ``cols`` and ``curPage``.
    """
    content = ""
    content += "<b style=\"color:rgb(64,128,128);\">This shows which topic each word belongs to:</b><br />"
    for doc in filtered_documents:
        color_dic = color_words(model, doc)
        colored_words = []
        for w in doc:
            # One-hot weights pick the colour of the word's topic.
            p = [0]*n_topics
            if w in color_dic:
                p[color_dic[w]] = 1
            colored_words.append(color(w, get_color(cols, p)))
        # Joining avoids the trailing space that the old, discarded
        # ``content.strip()`` call never actually removed.
        content += "<p>" + " ".join(colored_words) + "</p>"
    content += "<br /><p style=\"text-align:right\">" + str(curPage) + "</p>"
    display(HTML(content))
    
def page_7_display():
    """Render page 7: each training document coloured by its top-scoring topic.

    The top score is appended as a whole-number percentage.

    Reads the module-level ``model``, ``corpus_tfidf``, ``documents``,
    ``n_topics``, ``cols`` and ``curPage``.
    """
    content = "<b style=\"color:rgb(64,128,128);\">Just as what we would expect, the first three documents are in \
    topic 0 and the other three in topic 1.<br />A score that indicates how likely the document belongs to the topic \
    is also provided.</b><br />"
    for idx, topic_scores in enumerate(model[corpus_tfidf]):
        # (topic id, score) pair with the highest score; the first one wins ties.
        best = max(topic_scores, key=lambda pair: pair[1])
        p = [0]*n_topics
        p[best[0]] = 1
        col = get_color(cols, p)
        percent = "{0:.0f}%".format(best[1]*100)
        content += "<p style=\"color:"+col+";\">" + str(documents[idx]) + " (" + percent + ")</p>"
    content += "<br /><p style=\"text-align:right\">" + str(curPage) + "</p>"
    display(HTML(content))

def page_8_display():
    """Render page 8: the three held-out documents with words coloured by topic.

    Words that ``color_words`` assigns no topic get all-zero weights and are
    therefore rendered as ``rgb(0,0,0)``.

    Reads the module-level ``doc1``, ``doc2``, ``doc3`` (set by
    ``start_engine``), ``model``, ``n_topics``, ``cols`` and ``curPage``.
    """
    content = ""
    content += "<b style=\"color:rgb(64,128,128);\">Now for the new documents, we do the similar thing:</b><br />"
    new_docs = [doc1, doc2, doc3]
    for doc in new_docs:
        color_dic = color_words(model, doc)
        colored_words = []
        for w in doc:
            # One-hot weights pick the colour of the word's topic.
            p = [0]*n_topics
            if w in color_dic:
                p[color_dic[w]] = 1
            colored_words.append(color(w, get_color(cols, p)))
        # Joining avoids the trailing space that the old, discarded
        # ``content.strip()`` call never actually removed.
        content += "<p>" + " ".join(colored_words) + "</p>"
    content += "<br /><b style=\"color:rgb(64,128,128);\">While the first two documents clearly belong to topic 1 and 0, \
    the third document is somewhat in between (as we expected).</b><br />"   
    content += "<br /><p style=\"text-align:right\">" + str(curPage) + "</p>"
    display(HTML(content))
    
### Global Variables ###
# --- Analysis state ---
n_topics = 2   # number of topics the LDA model is trained with
wnl = WNL()    # shared lemmatizer used by stem_of_()

# Placeholders; start_engine() assigns the real objects.
stoplist = dict_raw_to_clean = filtered_documents = None
dictionary = corpus = tfidf = corpus_tfidf = model = None

# --- Presentation state ---
curPage, totalPage = 1, 8

# One [r, g, b] colour per topic -- length needs to be the number of topics
cols = [[0, 64, 128], [128, 64, 64]]

# Created by start_engine()
btn_start_over = None

# Navigation buttons and their click handlers
btn_previous_page = widgets.Button(description="Previous Page")
btn_next_page = widgets.Button(description="Next Page")
btn_previous_page.on_click(previous_page)
btn_next_page.on_click(next_page)

Engine



In [20]:

    
def _preprocess(text, stopwords):
    """Split ``text`` into words, drop those in ``stopwords`` and lemmatise the rest."""
    return [stem_of_(word) for word in clean_words_in_(text) if word not in stopwords]


def _topic_purity(lda_model, weighted_corpus):
    """Sum over all documents of the highest topic score the model gives each one."""
    return sum(max(pair[1] for pair in doc_topics)
               for doc_topics in lda_model[weighted_corpus])


def start_engine():
    """Run the whole analysis and show the "Start" button.

    Reads the module-level inputs ``documents``, ``new_doc_1``, ``new_doc_2``,
    ``new_doc_3`` and ``n_topics``, and fills in the module-level results used
    by the page renderers: ``stoplist``, ``dict_raw_to_clean``,
    ``filtered_documents``, ``dictionary``, ``corpus``, ``tfidf``,
    ``corpus_tfidf``, ``model``, ``doc1``, ``doc2``, ``doc3`` and
    ``btn_start_over``.

    No random seed is set here; the calling cell seeds NumPy beforehand to get
    the same results on each run.
    """
    global stoplist, dict_raw_to_clean, filtered_documents, dictionary, corpus, tfidf, corpus_tfidf, model
    global doc1, doc2, doc3
    global btn_start_over

    from gensim.parsing.preprocessing import STOPWORDS
    stoplist = STOPWORDS

    # For display purposes ONLY: raw word -> its cleaned tokens. Raw words
    # whose cleaned form is empty are left out (page 2 strikes them through).
    dict_raw_to_clean = {}
    for doc in documents:
        for raw_word in doc.split(" "):
            cleaned = _preprocess(raw_word, stoplist)
            if cleaned:
                dict_raw_to_clean[raw_word] = cleaned

    # Real analysis (from Sean)
    # Remove stop words and lemmatise
    filtered_documents = [_preprocess(document, stoplist) for document in documents]
    # Build dictionary (token <-> id)
    dictionary = corpora.Dictionary(filtered_documents)
    # Convert to corpus: (word index, frequency) pairs per document
    corpus = [dictionary.doc2bow(document) for document in filtered_documents]

    # Convert frequency to score, the more "unique" the word is, the higher the score:
    # It will take a vector and return another vector of the same dimensionality, except that
    # features which were rare in the training corpus will have their value increased.
    tfidf = models.TfidfModel(corpus, normalize=True)
    corpus_tfidf = tfidf[corpus]  # contains score information for each word index

    def train_lda():
        """Train one LDA model on the TF-IDF corpus with the notebook's settings."""
        return models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=n_topics,
                               passes=10, iterations=500)

    # Train it several times: keep the model with the highest purity. The
    # first model is the baseline; a later one only replaces it when its
    # purity is strictly higher.
    n_restarts = 100
    model = train_lda()
    purity = _topic_purity(model, corpus_tfidf)
    for _ in range(n_restarts):
        candidate = train_lda()
        candidate_purity = _topic_purity(candidate, corpus_tfidf)
        if purity < candidate_purity:
            model = candidate
            purity = candidate_purity

    # Testing set: the new documents, preprocessed like the training ones
    doc1 = _preprocess(new_doc_1, stoplist)
    doc2 = _preprocess(new_doc_2, stoplist)
    doc3 = _preprocess(new_doc_3, stoplist)

    btn_start_over = widgets.Button(description="Start")
    btn_start_over.on_click(start_over)
    display(btn_start_over)

Input and Start



In [21]:

    
numpy.random.seed(0) # setting random seed to get the same results each time.
# Training documents: six job descriptions, one string per job.
# Business Analyst, Data Scientist, Statistician vs. Sales Associate, Retail Assistant Manager and Customer Service Associate
documents = ["A business analyst (unspecified type) examines sets of data and documents to make informed conclusions. Among other duties, a business analyst acts as a facilitator within an organization to make internal departments more efficient. This type of financial expert typically thrives in a high-stress environment and is quick to adapt to the ever-changing conditions of businesses. As a general rule, a business analyst (unspecified type) reports data conclusions to a supervisor so appropriate judgment based on data can be made. Almost any company can benefit from a business analyst; however, this line of work is more common on financial and administrative industries. A business analyst needs to have sharp critical-thinking skills to process huge amounts of information on a regular basis. A business analyst works mostly indoors in an office setting during business hours, although it is not uncommon for them to be \"on call\" or work overtime. It is a highly mental job that requires taking into consideration many business-specific variables. Besides strong mental ability, this profession typically requires a minimum of an associate's degree, with companies typically preferring a bachelor's or master's degree in business administration, economics, finance, or a related field.",
             "IT data scientists are responsible for mining complex data and providing systems-related advice for their organization. They design new ways to incorporate vast information with a focus on information technology topics. They work with teams of other IT professionals to manage statistical data and create different models based on the needs of their company. They possess advanced analytical skills, in addition to their exceptional oral and written communication abilities. They process research information for easier consumption and transform it into actionable plans. They also provide value to their businesses through their findings and thoughtful insights. IT data scientists follow specific, strict company and industry guidelines in their work. They observe data privacy rights to ensure client satisfaction and avoid legal issues. They create networks of professionals to consult, including internal partners and external colleagues. Most of the time data scientists work in teams using collaborative filtering, k-nearest neighbors, market basket analysis and matrix factorization methods. They deal with cutting edge technologies on a regular basis, and often have the best tools available at their disposal. One of their main work tools is usually an industrial computer with high processing power and proprietary software applications for research tasks.",
             "A statistician is a broad term for a professional involved in the field of either theoretical or applied statistics. Statisticians analyze a variety of data to supplement the knowledge of professionals or businesses in the private or public sector. Their work hinges on their ability to produce reliable data using appropriate, up-to-date methods; this can mean, for example, managing and executing surveys or ensuring that accurate data is obtained via other means such as reviewing pre-existing records or via interviews. Statisticians process and analyze the data using different mathematical techniques and specialized software, then summarizing the results into practical advice for their employer. These conclusions may be used to advise on strategy and assist immediate decision-making. The type of people that statisticians work with varies. In private industry, statisticians may work at improving a supply chain or helping a manufacturing company reach certain operational standards, for example. In the field of medicine, they may work with researchers to evaluate data on new medicine.",
             "Sales associates often work in department stores. They answering customers’ questions, solve product concerns, and sometimes act as cashiers. Sales associates can also work in fields like telemarketing and auto sales. Sales associates often enter sales data into computers, so it is important that associates be comfortable working with computers. Sales associates can either work part-time or full-time, and the hours can vary from business to business. In some cases, associates can have irregular hours. Since sales associates are working directly with people, they should have excellent customer service skills and be able to communicate effectively with people of all personality types. Sales associates need to be punctual, positive, willing to help, outgoing, polite, willing to go \"the extra mile\" for customers, knowledgeable, able to work on their own, and ready to seek out customers who might need help.",
             "Retail assistant managers support retail managers and/or store managers in the daily functions of a retail store or the retail area of a store. Retail assistant managers are responsible for the direction of subordinates, implementing the retail manager’s directions and maintaining levels of excellent customer service. At times, retail assistant managers will need to perform the duties of their superior when that supervisor is unavailable and often will also perform the duties of retail associates as well, covering breaks and providing additional coverage at peak sales times. Primary daily activities of retail assistant managers include answering product questions from both retail associates and customers, helping with purchase choices, and dealing with customer returns, exchanges and other customer concerns. Often, retail assistant managers will have the duty of ensuring store cleanliness, maintaining sales floor merchandise levels, ensuring store policies and procedures are met and other company expectations are followed.",
             "Customer service associates have a variety of duties, the most important of which is assisting customers with any troubles or inquiries that they might have. Often, customer service associates help customers over the phone, but they may also deal with customers in person. A customer service associate must possess a service-oriented attitude, as well as a friendly and personable attitude. They must be able to accurately read situations and address customer needs in a reliable and timely fashion. Often, customer service associates need to sit for long periods at a time, and they must be able to speak on the phone for long periods of time. Other duties required of customer service associates might include to word processing, data entry and other light office duties such as copying and filing. Customer service associates should possess a friendly upbeat personality, and they should be motivated, passionate, and love working with people."]
# Number of topics for the model; the `cols` colour list must have this many entries.
n_topics = 2

# Test model with new inputs that were not part of the training documents
# Financial Analyst & Cashier
new_doc_1 = "Corporations and businesses typically have a responsibility to shareholders and owners to use earned income in a way that builds company wealth. A financial analyst carefully studies marketplace trends, demographics and microeconomic factors to help the company make smart investments. The financial analyst may also provide advice to companies on issuing their own bonds, splitting stock and other areas of concern. One of the most important roles for a financial analyst is to fully understand how and where a company has invested its resources, as well as how secure and viable that financial outlay will be going forward. An analyst needs to not only understand how current investments affect the company, but also how those investments and future financial interactions will impact short- and long-term growth. The analyst is expected to provide information on the company's current financial position and make recommendations to company decision-makers. For instance, the analyst may inform an executive board about whether expansion may be high risk or help the company decide on issuing bonds to cover capital improvements. The analyst may also provide advice and analysis on protecting a company's wealth in the short term during economic downturns." # should be all about topic 1
new_doc_2 = "A Cashier is an employee who operates a cash drawer by taking orders of customers and handling store receipts and money. Typically, a Cashier will be working a regular 8-hour workday. After clocking in, a Cashier's day begins. For most of the day, a Cashier will be stationed inside the employer's building at a particular cash register, the contents of which will be the Cashier's cash drawer. The cash drawer will be filled at the beginning of the day with a set amount of cash consisting of various denominations of bills and coins. The Cashier assigned to this cash register on this day will be responsible for healthily maintaining the level of cash in the drawer, as well as the necessary coins needed to dispense change to customers who pay with larger bills. A Cashier will also be fulfilling credit and debit card transactions and may also facilitate check payments, depending on the situation. The Cashier will be responsible for taking the customers' orders and payments, giving them back any change they are due, processing their payments, and delivering both the customers' orders (which the Cashier will bag for the convenience of the customer) and their sales receipts." # should be all about topic 0
# Ignore this for now (it is still preprocessed by start_engine() and shown on page 8)
new_doc_3 = 'I could only afford an apple from local farm, as the iPhone and Apple Watch are way too expensive'

# Run the analysis on the inputs above and display the "Start" button.
start_engine()









    




Now for the new documents, we do the similar thing:
corporation business typically responsibility shareholder owner use earned income way build company wealth financial analyst carefully study marketplace trend demographic microeconomic factor help company smart investment financial analyst provide advice company issuing bond splitting stock area concern important role financial analyst fully understand company invested resource secure viable financial outlay going forward analyst need understand current investment affect company investment future financial interaction impact short long term growth analyst expected provide information company's current financial position recommendation company decision maker instance analyst inform executive board expansion high risk help company decide issuing bond cover capital improvement analyst provide advice analysis protecting company's wealth short term economic downturn  
cashier employee operates cash drawer taking order customer handling store receipt money typically cashier working regular 8 hour workday clocking cashier's day begin day cashier stationed inside employer's building particular cash register content cashier's cash drawer cash drawer filled beginning day set cash consisting denomination bill coin cashier assigned cash register day responsible healthily maintaining level cash drawer necessary coin needed dispense change customer pay larger bill cashier fulfilling credit debit card transaction facilitate check payment depending situation cashier responsible taking customer order payment giving change processing payment delivering customer order cashier bag convenience customer sale receipt  
afford apple local farm iphone apple watch way expensive 

While the first two documents clearly belong to topic 1 and 0,     the third document is somewhat in between (as we expected).

8



In [ ]: